/*!
 * \file cuda_multicorrelator.cu
 * \brief Highly optimized CUDA GPU vector multiTAP correlator class
 * \authors <ul>
 *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
 *          </ul>
 *
 * Class that implements a highly optimized vector multiTAP correlator for NVIDIA CUDA GPUs
 *
 * -----------------------------------------------------------------------------
 *
 * Copyright (C) 2010-2020  (see AUTHORS file for a list of contributors)
 *
 * GNSS-SDR is a software defined Global Navigation
 *          Satellite Systems receiver
 *
 * This file is part of GNSS-SDR.
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 *
 * -----------------------------------------------------------------------------
 */

#include "cuda_multicorrelator.h"
#include <cstdio>   // printf(), fprintf()
#include <cstdlib>  // rand(), exit()
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#define ACCUM_N 128
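// ACCUM_N is the number of partial accumulators per correlator tap: it must be
// a power of two for the tree-like reduction in the kernel below, and keeping
// it a multiple of the warp size helps memory coalescing.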

__global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
    GPU_Complex *d_corr_out,
    GPU_Complex *d_sig_in,
    GPU_Complex *d_sig_wiped,
    GPU_Complex *d_local_code_in,
    float *d_shifts_chips,
    int code_length_chips,
    float code_phase_step_chips,
    float rem_code_phase_chips,
    int vectorN,
    int elementN,
    float rem_carrier_phase_in_rad,
    float phase_step_rad)
{
    // Per-block accumulator cache in shared memory
    __shared__ GPU_Complex accumResult[ACCUM_N];

    // CUDA version of a floating-point NCO and vector dot product, integrated:
    // grid-stride carrier wipe-off, each thread wiping a subset of the samples
    float carrier_sin;
    float carrier_cos;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < elementN;
         i += blockDim.x * gridDim.x)
        {
            __sincosf(rem_carrier_phase_in_rad + i * phase_step_rad, &carrier_sin, &carrier_cos);
            d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(carrier_cos, -carrier_sin);
        }
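    // NOTE: __syncthreads() only synchronizes the threads of one block; the
    // correlation loop below also reads d_sig_wiped samples written by other
    // blocks, so strict grid-wide ordering would require a separate kernel
    // launch (see the sketch after this kernel) or cooperative groups.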

    __syncthreads();
    ////////////////////////////////////////////////////////////////////////////
    // Cycle through every pair of vectors,
    // taking into account that vector counts can be different
    // from total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
        {
            //int vectorBase = IMUL(elementN, vec);
            //int vectorEnd  = elementN;

            ////////////////////////////////////////////////////////////////////////
            // Each accumulator cycles through the vectors with a stride equal
            // to the total number of accumulators ACCUM_N.
            // At this stage ACCUM_N only needs to be a multiple of the warp
            // size to meet the memory coalescing alignment constraints.
            ////////////////////////////////////////////////////////////////////////
            for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
                {
                    GPU_Complex sum = GPU_Complex(0, 0);
                    float local_code_chip_index = 0.0f;
                    for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
                        {
                            //original sample code
                            //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
                            //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
                            //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);

                            //custom code for multitap correlator
                            // 1.resample local code for the current shift

                            local_code_chip_index = fmodf(code_phase_step_chips * __int2float_rd(pos) + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);

                            // Take into account that in multitap correlators the shifts can be negative!
                            // fmodf keeps the sign of its argument, so a negative remainder must be
                            // wrapped back into [0, code_length_chips). E.g. with code_length_chips = 1023
                            // and a raw index of -0.25, the resampled chip index becomes 1022.75.
                            if (local_code_chip_index < 0.0f) local_code_chip_index += code_length_chips;
                            //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
                            // 2.correlate
                            sum.multiply_acc(d_sig_wiped[pos], d_local_code_in[__float2int_rd(local_code_chip_index)]);
                        }
                    accumResult[iAccum] = sum;
                }

            ////////////////////////////////////////////////////////////////////////
            // Perform a tree-like reduction of the accumulators' results:
            // the active width halves on each pass (128 -> 64 -> ... -> 1),
            // so ACCUM_N has to be a power of two at this stage.
            ////////////////////////////////////////////////////////////////////////
            for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
                {
                    __syncthreads();

                    for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                        {
                            accumResult[iAccum] += accumResult[stride + iAccum];
                        }
                }

            if (threadIdx.x == 0)
                {
                    d_corr_out[vec] = accumResult[0];
                }
            // Barrier before the next tap reuses accumResult: without it, a thread
            // could start overwriting partial sums that thread 0 is still reading
            // for the current tap
            __syncthreads();
        }
}
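
// A minimal sketch (hypothetical; not used by this class) of how the carrier
// wipe-off could run as a separate kernel: a kernel-launch boundary, unlike
// the block-local __syncthreads() above, guarantees that all wipe-off writes
// are visible grid-wide before a subsequent correlation kernel reads them.
__global__ void carrier_wipeoff_sketch(GPU_Complex *d_out,
    GPU_Complex *d_in,
    float rem_carrier_phase_in_rad,
    float phase_step_rad,
    int elementN)
{
    float carrier_sin;
    float carrier_cos;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < elementN; i += blockDim.x * gridDim.x)
        {
            // Same NCO as in the integrated kernel: rotate each sample by the
            // negated instantaneous carrier phase
            __sincosf(rem_carrier_phase_in_rad + i * phase_step_rad, &carrier_sin, &carrier_cos);
            d_out[i] = d_in[i] * GPU_Complex(carrier_cos, -carrier_sin);
        }
}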


bool cuda_multicorrelator::init_cuda_integrated_resampler(
    int signal_length_samples,
    int code_length_chips,
    int n_correlators)
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    //    findCudaDevice(argc, (const char **)argv);
    cudaDeviceProp prop;
    int num_devices, device;
    cudaGetDeviceCount(&num_devices);
    if (num_devices > 1)
        {
            int max_multiprocessors = 0, max_device = 0;
            for (device = 0; device < num_devices; device++)
                {
                    cudaDeviceProp properties;
                    cudaGetDeviceProperties(&properties, device);
                    if (max_multiprocessors < properties.multiProcessorCount)
                        {
                            max_multiprocessors = properties.multiProcessorCount;
                            max_device = device;
                        }
                    printf("Found GPU device # %i\n", device);
                }
            //cudaSetDevice(max_device);

            // Set a random device to split the threads among the available GPUs:
            // rand() % num_devices yields an index in [0, num_devices - 1]
            selected_gps_device = rand() % num_devices;
            cudaSetDevice(selected_gps_device);

            cudaGetDeviceProperties(&prop, selected_gps_device);
            //debug code
            if (prop.canMapHostMemory != 1)
                {
                    printf("Device can not map memory.\n");
                }
            printf("L2 Cache size= %u \n", prop.l2CacheSize);
            printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock);
            printf("maxGridSize= %i \n", prop.maxGridSize[0]);
            printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock);
            printf("deviceOverlap= %i \n", prop.deviceOverlap);
            printf("multiProcessorCount= %i \n", prop.multiProcessorCount);
        }
    else
        {
            cudaGetDevice(&selected_gps_device);
            cudaGetDeviceProperties(&prop, selected_gps_device);
            //debug code
            if (prop.canMapHostMemory != 1)
                {
                    printf("Device can not map memory.\n");
                }

            printf("L2 Cache size= %u \n", prop.l2CacheSize);
            printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock);
            printf("maxGridSize= %i \n", prop.maxGridSize[0]);
            printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock);
            printf("deviceOverlap= %i \n", prop.deviceOverlap);
            printf("multiProcessorCount= %i \n", prop.multiProcessorCount);
        }

    // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));

    // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
    size_t size = signal_length_samples * sizeof(GPU_Complex);

    // ******** ZERO COPY VERSION ************
    // Set flag to enable zero copy access
    // Optimal in shared memory devices (like the Jetson TK1)
    // cudaSetDeviceFlags(cudaDeviceMapHost);

    // ******* CudaMalloc version ***********

    // input signal GPU memory (can be mapped to CPU memory in shared memory devices!)
    //    cudaMalloc((void **)&d_sig_in, size);
    //    cudaMemset(d_sig_in,0,size);

    // Doppler-free signal (internal GPU memory)
    cudaMalloc((void **)&d_sig_doppler_wiped, size);
    cudaMemset(d_sig_doppler_wiped, 0, size);

    // Local code GPU memory (can be mapped to CPU memory in shared memory devices!)
    cudaMalloc((void **)&d_local_codes_in, sizeof(GPU_Complex) * code_length_chips);
    cudaMemset(d_local_codes_in, 0, sizeof(GPU_Complex) * code_length_chips);

    d_code_length_chips = code_length_chips;

    // Vector with the chip shifts for each correlator tap
    //GPU memory (can be mapped to CPU memory in shared memory devices!)
    cudaMalloc((void **)&d_shifts_chips, sizeof(float) * n_correlators);
    cudaMemset(d_shifts_chips, 0, sizeof(float) * n_correlators);

    //scalars
    //cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
    //cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);

    // Kernel launch geometry
    // TODO: write a smart load balancer using the device info queried above!
    threadsPerBlock = 64;
    blocksPerGrid = 128;  //(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
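    // With 64 threads x 128 blocks = 8192 threads, the grid-stride loops in the
    // kernel cover any signal_length_samples, and the 128 blocks cycle over the
    // n_correlators taps.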

    cudaStreamCreate(&stream1);
    //cudaStreamCreate (&stream2) ;
    return true;
}


bool cuda_multicorrelator::set_local_code_and_taps(
    int code_length_chips,
    const std::complex<float> *local_codes_in,
    float *shifts_chips,
    int n_correlators)
{
    cudaSetDevice(selected_gps_device);
    // ******** ZERO COPY VERSION ************
    //    // Get device pointer from host memory. No allocation or memcpy
    //    cudaError_t code;
    //    // local code CPU -> GPU copy memory
    //    code=cudaHostGetDevicePointer((void **)&d_local_codes_in,  (void *) local_codes_in, 0);
    //    if (code!=cudaSuccess)
    //    {
    //        printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
    //    }
    //    // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
    //    code=cudaHostGetDevicePointer((void **)&d_shifts_chips,  (void *) shifts_chips, 0);
    //    if (code!=cudaSuccess)
    //    {
    //        printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
    //    }

    // ******* CudaMalloc version ***********
    // Local code CPU -> GPU memory copy
    cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex) * code_length_chips, cudaMemcpyHostToDevice, stream1);
    d_code_length_chips = code_length_chips;

    // Correlator shifts vector CPU -> GPU memory copy (fractional chip shifts are allowed!)
    cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float) * n_correlators,
        cudaMemcpyHostToDevice, stream1);
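
    // NOTE: cudaMemcpyAsync overlaps with host/device work only when the host
    // buffers are page-locked (cudaHostAlloc / cudaHostRegister); with pageable
    // memory the transfer is effectively synchronous.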

    return true;
}


bool cuda_multicorrelator::set_input_output_vectors(
    std::complex<float> *corr_out,
    std::complex<float> *sig_in)
{
    cudaSetDevice(selected_gps_device);
    // Save CPU pointers
    d_sig_in_cpu = sig_in;
    d_corr_out_cpu = corr_out;

    // Zero Copy version
    // Get device pointer from host memory. No allocation or memcpy
    cudaError_t code;
    code = cudaHostGetDevicePointer((void **)&d_sig_in, (void *)sig_in, 0);
    if (code != cudaSuccess)
        {
            printf("cuda cudaHostGetDevicePointer error for sig_in \r\n");
        }
    code = cudaHostGetDevicePointer((void **)&d_corr_out, (void *)corr_out, 0);
    if (code != cudaSuccess)
        {
            printf("cuda cudaHostGetDevicePointer error for corr_out \r\n");
        }
    return true;
}
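
// The zero-copy path above requires sig_in and corr_out to be mapped,
// page-locked host buffers. A minimal allocation sketch (buffer and size
// names are illustrative):
//
//   cudaSetDeviceFlags(cudaDeviceMapHost);  // before the first allocation
//   std::complex<float> *sig_in;
//   cudaHostAlloc((void **)&sig_in, size_in_bytes, cudaHostAllocMapped);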

#define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
}


bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
    float rem_carrier_phase_in_rad,
    float phase_step_rad,
    float code_phase_step_chips,
    float rem_code_phase_chips,
    int signal_length_samples,
    int n_correlators)
{
    cudaSetDevice(selected_gps_device);
    // cudaMemCpy version
    //size_t memSize = signal_length_samples * sizeof(std::complex<float>);
    // input signal CPU -> GPU copy memory
    //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize,
    //                               cudaMemcpyHostToDevice, stream2);
    // **** NOTICE: the NCO is computed on the fly; no need to copy it to the GPU! ****

    // Launch the multitap correlator with the integrated local code resampler

    Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(
        d_corr_out,
        d_sig_in,
        d_sig_doppler_wiped,
        d_local_codes_in,
        d_shifts_chips,
        d_code_length_chips,
        code_phase_step_chips,
        rem_code_phase_chips,
        n_correlators,
        signal_length_samples,
        rem_carrier_phase_in_rad,
        phase_step_rad);

    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaStreamSynchronize(stream1));

    // cudaMemCpy version
    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    //scalar products (correlators outputs)
    //cudaMemcpyAsync(d_corr_out_cpu, d_corr_out, sizeof(std::complex<float>)*n_correlators,
    //        cudaMemcpyDeviceToHost,stream1);
    return true;
}
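
// Typical host-side call sequence (a sketch; sig_in/corr_out must be mapped,
// page-locked buffers as described in set_input_output_vectors, and the
// variable names here are illustrative):
//
//   cuda_multicorrelator correlator;
//   correlator.init_cuda_integrated_resampler(n_samples, code_length_chips, n_taps);
//   correlator.set_local_code_and_taps(code_length_chips, local_code, shifts_chips, n_taps);
//   correlator.set_input_output_vectors(corr_out, sig_in);
//   correlator.Carrier_wipeoff_multicorrelator_resampler_cuda(rem_carrier_phase_rad,
//       phase_step_rad, code_phase_step_chips, rem_code_phase_chips, n_samples, n_taps);
//   correlator.free_cuda();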


cuda_multicorrelator::cuda_multicorrelator()
{
    d_sig_in = NULL;
    d_nco_in = NULL;
    d_sig_doppler_wiped = NULL;
    d_local_codes_in = NULL;
    d_shifts_samples = NULL;
    d_shifts_chips = NULL;
    d_corr_out = NULL;
    threadsPerBlock = 0;
    blocksPerGrid = 0;
    d_code_length_chips = 0;
    selected_gps_device = 0;  // default until init_cuda_integrated_resampler() selects a device
}


bool cuda_multicorrelator::free_cuda()
{
    // Free device global memory.
    // d_sig_in and d_corr_out are zero-copy device pointers into caller-owned,
    // page-locked host memory (see set_input_output_vectors), so they must not
    // be released with cudaFree here.
    if (d_nco_in != NULL) cudaFree(d_nco_in);
    if (d_sig_doppler_wiped != NULL) cudaFree(d_sig_doppler_wiped);
    if (d_local_codes_in != NULL) cudaFree(d_local_codes_in);
    if (d_shifts_samples != NULL) cudaFree(d_shifts_samples);
    if (d_shifts_chips != NULL) cudaFree(d_shifts_chips);
    // Reset the device and exit
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    return true;
}
